library(readxl)
library(sf)
library(dplyr)
library(lme4)
library(broom.mixed)
library(ggplot2)
library(dplyr)
library(purrr)
library(boot)
library(tidyverse)
library(scales)
options(scipen = 999)

# Load Data
# Read in the DUN Demography and Election Results data
dun_demography_data <- read_excel("DUN Demography and Election Results 2018.xlsx")
dun_boundaries <- st_read("DUN (2020 - PRESENT)/MALAYSIA_DUN_2020_FOR_UMICH.shp")
parliament_boundaries <- st_read("PARLIAMENT (2020 - PRESENT)/MALAYSIA_PAR_2020_FOR_UMICH.shp")
parliament_boundaries_filtered <- parliament_boundaries %>%
  filter(!(NEGERI %in% c("SARAWAK", "SABAH","WP LABUAN","WP KUALA LUMPUR", "WP PUTRAJAYA")))

dun_boundaries_filtered <- dun_boundaries %>%
  filter(!(NEGERI %in% c("SARAWAK", "SABAH")))

demographic_data <- read_excel("demographic_and_election_parliament_18data.xlsx")
parliament_boundaries_filtered_df <- as.data.frame(parliament_boundaries_filtered)
matched_data <- left_join(parliament_boundaries_filtered_df, demographic_data, by = "KODPAR")
matched_data_sf <- st_as_sf(matched_data, sf_column_name = "geometry", crs = st_crs(parliament_boundaries_filtered))

# Convert dun_boundaries_filtered to a data frame for joining
dun_boundaries_filtered_df <- as.data.frame(dun_boundaries_filtered)

# Perform the join based on KODPAR and KODDUN
matched_dun_data <- left_join(dun_boundaries_filtered_df, dun_demography_data, by = c("KODPAR","KODDUN"))

dun_aggregated_to_parliament <- matched_dun_data %>%
  group_by(KODPAR)

parliament_with_dun_data <- left_join(matched_data_sf, dun_aggregated_to_parliament, by = "KODPAR")

# Variable Transformation
parliament_with_dun_data <- parliament_with_dun_data %>%
  mutate(Malay_Share_Diff = Malay_Share_Dun - Malay.Share,
         PH_Share_Diff = PH_Share_Dun - PH.Share,
         BN_Share_Diff = BN_Share_Dun - BN.Share,
         Population_Density_Diff = population_density_dun - population_density)

matched_dun_data_sf <- st_as_sf(
  parliament_with_dun_data, 
  sf_column_name = "geometry.y", 
  crs = st_crs(dun_boundaries_filtered)
)

matched_dun_data_sf <- matched_dun_data_sf %>%
  mutate(Pop_Diff_std = (Population_Density_Diff - mean(Population_Density_Diff, na.rm = TRUE)) /
           sd(Population_Density_Diff, na.rm = TRUE))

summary(matched_dun_data_sf$Population_Density_Diff)

ggplot(matched_dun_data_sf) +
  geom_sf(aes(fill = Population_Density_Diff), color = NA) +
  scale_fill_gradient2(
    low = "darkblue", mid = "white", high = "darkred",
    midpoint = 0,
    trans = pseudo_log_trans(sigma = 0.03), 
    limits = c(-4, 16), oob = squish,
    breaks = c(-4, 0, 4, 16), 
    name = "Population Density Difference"
  ) +
  theme_minimal() +
  theme(
    text = element_text(size = 12),             # base font size for all text
    axis.title = element_text(size = 12),       # axis titles
    axis.text = element_text(size = 8),        # axis tick labels
    legend.title = element_text(size = 8),     # legend title
    legend.text = element_text(size = 8)       # legend labels
  ) +
  guides(
    fill = guide_colorbar(
      barwidth = 1,       # length of the color bar
      barheight = 4,     # thickness
      title.position = "top",  # move title above the bar
      title.hjust = 0.5        # center title
    )
  )


# Bootstrap
re_model3 <- lmer(
  BN_Share_Diff ~ Malay_Share_Diff + Population_Density_Diff + 
    (Population_Density_Diff | state.y),
  data = matched_dun_data_sf
)

# Original fixed + random effects slopes
re_df   <- ranef(re_model3, condVar = TRUE)$state.y
fixed_slope  <- fixef(re_model3)["Population_Density_Diff"]
state_slope  <- fixed_slope + re_df[, "Population_Density_Diff"]
state_var <- rownames(re_df)

get_bootstrap_slopes_raw <- function(data, state_var, n_boot = 100000,
                                     structure = c("uncorrelated","correlated","intercept_only")) {
  structure <- match.arg(structure)
  
  # ensure factor grouping
  data <- data %>% mutate(!!sym(state_var) := factor(!!sym(state_var)))
  states <- levels(data[[state_var]])
  
  # choose a safer random-effects structure
  re_term <- switch(
    structure,
    "uncorrelated" = paste0("(0 + Population_Density_Diff | ", state_var, ") + (1 | ", state_var, ")"),
    "correlated"   = paste0("(1 + Population_Density_Diff | ", state_var, ")"),
    "intercept_only" = paste0("(1 | ", state_var, ")")
  )
  
  frm <- as.formula(paste0(
    "BN_Share_Diff ~ Population_Density_Diff + Malay_Share_Diff + ", re_term
  ))
  
  # use map() + list_rbind() to avoid deprecation noise
  res_list <- map(seq_len(n_boot), function(i) {
    sampled <- sample(states, length(states), replace = TRUE)
    boot_sample <- map(sampled, ~ data %>% filter(!!sym(state_var) == .x)) |> list_rbind()
    boot_sample[[state_var]] <- factor(boot_sample[[state_var]])
    
    # muffle warnings safely
    model_raw <- withCallingHandlers(
      tryCatch(
        lmer(frm, data = boot_sample,
             control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 1e6))),
        error = function(e) NULL
      ),
      warning = function(w) invokeRestart("muffleWarning")
    )
    if (is.null(model_raw)) return(NULL)
    
    fixed_slope <- unname(fixef(model_raw)[["Population_Density_Diff"]])
    if (is.na(fixed_slope)) return(NULL)
    
    re_df <- ranef(model_raw)[[state_var]] %>% rownames_to_column(var = "group")
    
    tibble(
      group = re_df$group,
      pop_density_slope_raw = re_df[["Population_Density_Diff"]] %||% NA_real_,
      total_slope_raw = fixed_slope + pop_density_slope_raw,
      iter = i
    )
  })
  
  list_rbind(res_list)
}

set.seed(123)
bootstrap_slopes_raw <- suppressMessages(
  suppressWarnings(get_bootstrap_slopes_raw(
    matched_dun_data_sf,
    state_var = "state.y",
    n_boot = 100000,                
    structure = "uncorrelated"  
)))

# Calculate bootstrapped SEs (Table 4)
boot_summary_raw <- bootstrap_slopes_raw %>%
  group_by(group) %>%
  summarise(
    mean_slope = mean(total_slope_raw, na.rm = TRUE),
    se   = sd(total_slope_raw, na.rm = TRUE),
    lower = mean_slope - 1.96 * se,
    upper = mean_slope + 1.96 * se,
    .groups = "drop"
  )

# Figure 8
ggplot(boot_summary_raw, aes(x = reorder(group, mean_slope), y = mean_slope)) +
  geom_point(color = "#2A4B7C", size = 2) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.3, color = "#2A4B7C") +
  labs(
    #title = "Effect of Population Density on BN Support by State (Bootstrapped)",
    #subtitle = "Slope from mixed-effects model with bootstrapped standard errors",
    x = "State",
    y = "Slope (ΔBN.Share per unit of Population Density)"
  ) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey70") +
  theme_minimal(base_family = "Times New Roman") +
  coord_flip() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.text.y = element_text(size = 10)
  )
